{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.cross_validation import train_test_split\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "import os\n",
    "import pickle\n",
    "import json\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('language_dict.json','r') as fopen:\n",
    "    languages = json.load(fopen)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### You can get the dataset from [here](https://tatoeba.org/eng/downloads)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>1</th>\n",
       "      <th>cmn</th>\n",
       "      <th>我們試試看！</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>cmn</td>\n",
       "      <td>我该去睡觉了。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3</td>\n",
       "      <td>cmn</td>\n",
       "      <td>你在干什麼啊？</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4</td>\n",
       "      <td>cmn</td>\n",
       "      <td>這是什麼啊？</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>5</td>\n",
       "      <td>cmn</td>\n",
       "      <td>今天是６月１８号，也是Muiriel的生日！</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>6</td>\n",
       "      <td>cmn</td>\n",
       "      <td>生日快乐，Muiriel！</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   1  cmn                  我們試試看！\n",
       "0  2  cmn                 我该去睡觉了。\n",
       "1  3  cmn                 你在干什麼啊？\n",
       "2  4  cmn                  這是什麼啊？\n",
       "3  5  cmn  今天是６月１８号，也是Muiriel的生日！\n",
       "4  6  cmn           生日快乐，Muiriel！"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lang = pd.read_csv('sentences.csv',sep='\\t')\n",
    "lang = lang.dropna()\n",
    "lang.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "selected_langs = ['zlm','eng','ind']\n",
    "lang.loc[~lang.cmn.isin(selected_langs),'cmn'] = 'OTHER'\n",
    "selected_langs.append('OTHER')\n",
    "sentences, langs = [], []\n",
    "for i in selected_langs:\n",
    "    filtered = lang.loc[lang.cmn == i]\n",
    "    sentences += filtered.iloc[:100000,-1].tolist()\n",
    "    langs += filtered.iloc[:100000,1].tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "del lang"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array(['OTHER', 'eng', 'ind', 'zlm'], dtype='<U5'),\n",
       " array([100000, 100000,  11808,     91]))"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.unique(langs,return_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "for file in ['negative','positive']:\n",
    "    with open(file,'r') as fopen:\n",
    "        bm = (' '.join(fopen.read().split('\\n'))).split()\n",
    "        new_langs = [' '.join(bm[i:i+4]) for i in range(0, len(bm), 4)] \n",
    "        sentences += new_langs\n",
    "        langs += ['zlm'] * len(new_langs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def simple_textcleaning_language_detection(string):\n",
    "    string = re.sub('[^A-Za-z ]+', ' ', string)\n",
    "    string = filter(None, string.split())\n",
    "    string = [y.strip() for y in string if len(y) > 1]\n",
    "    return ' '.join(string).lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "bm = ''\n",
    "for i in [i for i in os.listdir(os.getcwd()) if i.find('isu')>=0]:\n",
    "    with open(i,'r') as fopen:\n",
    "        isu = json.load(fopen)\n",
    "    bm += ' '.join([simple_textcleaning_language_detection(i['summary']) for i in isu if i['language']=='id'])\n",
    "bm = bm.split()\n",
    "new_langs = [' '.join(bm[i:i+4]) for i in range(0, len(bm), 4)] \n",
    "sentences += new_langs\n",
    "langs += ['zlm'] * len(new_langs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array(['OTHER', 'eng', 'ind', 'zlm'], dtype='<U5'),\n",
       " array([100000, 100000,  11808,  94281]))"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.unique(langs,return_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(306089, 934458)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "target = LabelEncoder().fit_transform(langs)\n",
    "bow_chars = CountVectorizer(ngram_range=(2, 4), analyzer='char').fit(sentences)\n",
    "vectors = bow_chars.transform(sentences)\n",
    "vectors.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_X, test_X, train_Y, test_Y = train_test_split(vectors, target, test_size = 0.2)\n",
    "del vectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "      OTHER       1.00      0.99      0.99     79852\n",
      "        eng       0.98      1.00      0.99     80065\n",
      "        ind       0.95      0.52      0.67      9458\n",
      "        zlm       0.94      0.99      0.97     75496\n",
      "\n",
      "avg / total       0.97      0.97      0.97    244871\n",
      "\n"
     ]
    }
   ],
   "source": [
    "multinomial = MultinomialNB().fit(train_X, train_Y)\n",
    "print(metrics.classification_report(train_Y, multinomial.predict(train_X), target_names = np.unique(langs)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "      OTHER       1.00      0.98      0.99     20148\n",
      "        eng       0.98      1.00      0.99     19935\n",
      "        ind       0.91      0.49      0.64      2350\n",
      "        zlm       0.94      0.99      0.97     18785\n",
      "\n",
      "avg / total       0.97      0.97      0.97     61218\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(metrics.classification_report(test_Y, multinomial.predict(test_X), target_names = np.unique(langs)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('multinomial-language-detection.pkl','wb') as fopen:\n",
    "    pickle.dump(multinomial,fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('bow-language-detection.pkl','wb') as fopen:\n",
    "    pickle.dump(bow_chars,fopen)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
